%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler
# calculate accuracy measures and confusion matrix/f1 score
from sklearn import metrics
from sklearn.metrics import f1_score
# Data-source locations for the UCI Parkinsons dataset and its schema notes.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/parkinsons.data"
headers = "https://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/parkinsons.names"
# Tokens treated as missing on read, so markers beyond plain na/null
# (e.g. "?" or special characters) are caught at load time.
missing_values = ["n/a", "na", "--", "", "?"]
Parkinsons_df_orig = pd.read_csv(url, skipinitialspace=True, na_values=missing_values)
Parkinsons_df_orig.info()
# Work on a copy so the original load stays untouched.
Parkinsons_df = Parkinsons_df_orig.copy()
print ('The Parkinsons data has {0} rows and {1} columns'.format(Parkinsons_df.shape[0],Parkinsons_df.shape[1]))
# Drop the ASCII subject/recording identifier: unique per row, no modeling value.
Parkinsons_df = Parkinsons_df.drop('name', axis=1)
print ('Dropping the ASCII Subject name or recording number columns; this is unique column which does not help in modeling')
# Missing-value identification (isna and isnull are aliases; both reported)
na_total = Parkinsons_df.isna().sum().sum()
null_total = Parkinsons_df.isnull().sum().sum()
print ('The Parkinsons data has {} missing values and {} null values'.format(na_total, null_total))
print(Parkinsons_df.nunique())  # number of distinct values per column
Parkinsons_df.describe().transpose()  # descriptive statistical summary
print(Parkinsons_df.skew(numeric_only=True, axis=0))
print ('The positive values of Skew indicates Right-skewed distribution and negative values Left-Skewed distribution')
# Univariate distributions: one frequency plot per float-typed feature to
# read the dispersion and skew of the independent variables.
col_names = Parkinsons_df.select_dtypes(include=[np.float64]).columns
fig, ax = plt.subplots(len(col_names), figsize=(15,150))
for i, col_val in enumerate(col_names):
    # sns.distplot was deprecated (and later removed) in seaborn;
    # histplot(kde=True) produces the same histogram-plus-density view.
    sns.histplot(Parkinsons_df[col_val], kde=True, ax=ax[i])
    ax[i].set_title('Freq dist '+col_val, fontsize=10)
    ax[i].set_xlabel(col_val, fontsize=8)
    ax[i].set_ylabel('Count', fontsize=8)
plt.show()
# Observations from the skew values and the distributions above.
# Right-skewed attributes (mean > median):
#   MDVP:Fo(Hz) - Average vocal fundamental frequency
#   MDVP:Fhi(Hz) - Maximum vocal fundamental frequency
#   MDVP:Flo(Hz) - Minimum vocal fundamental frequency
#   MDVP:Jitter(%), MDVP:Jitter(Abs), MDVP:RAP, MDVP:PPQ, Jitter:DDP - measures of variation in fundamental frequency
#   MDVP:Shimmer, MDVP:Shimmer(dB), Shimmer:APQ3, Shimmer:APQ5, MDVP:APQ, Shimmer:DDA - measures of variation in amplitude
#   NHR - ratio of noise to tonal components in the voice
#   spread1, spread2, PPE - three nonlinear measures of fundamental frequency variation
#   D2 - non-linear dynamical complexity measure
# Partially left-skewed attributes (mean < median):
#   HNR - ratio of noise to tonal components in the voice
#   RPDE - non-linear dynamical complexity measure
#   DFA - signal fractal scaling exponent
# Target variable: status - health status of the subject, 1 = Parkinson's, 0 = healthy
# Class balance: 48 healthy vs 147 Parkinson's.
status_counts = Parkinsons_df['status'].value_counts()
status_counts.plot.bar(title='Healthy vs Parkinsons')
print(status_counts)  # 0 - Healthy(48)  1 - Parkinsons(147)
# Finding multi-collinearity - Bi-Variate Analysis
plt.figure(figsize=(25, 15))
ax = sns.heatmap(Parkinsons_df.corr(),annot = True, linewidths = 0.5, cmap="YlGnBu")
# Workaround for the matplotlib 3.1.x regression where seaborn heatmaps clip
# the top and bottom rows: widen the y-limits by half a cell on each side.
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
# MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP - Several measures of variation in fundamental frequency
# Jitter(%) has strong positive correlation with Jitter(abs), RAP, PPQ and DDP with R values of 0.94, 0.99,0.97
# and 0.99 respectively
# MDVP:Shimmer,MDVP:Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,MDVP:APQ,Shimmer:DDA - Several measures of variation in amplitude
# MDVP:Shimmer has strong positive correlation with Shimmer(dB), Shimmer:APQ3, Shimmer:APQ5, MDVP:APQ and Shimmer:DDA with
# r values 0.99,0.99,0.98,0.95 and 0.98 respectively
# Second set of Variables which has high-correlation coefficient
# NHR - Measure of ratio of noise to tonal components in the voice is positively correlated with the
# several measures of variation in fundamental frequency i.e. Jitter(%), Jitter(abs), RAP, PPQ and DDP
# with 0.91, 0.83, 0.92, 0.84 and 0.92 as measures
# spread1,PPE - Non-linear measures of fundamental frequency variation
# Multi-collinear with high correlation coefficient of 0.96
# Hence, the above set of independent attributes are multi-collinear and works counter-productive while
# building model to predict Parkinsons
# We need domain expertise/advice to decide if a feature can be dropped while building the model to predict the subject
# Amplitude can be expressed as a function of frequency as below
# A = y(t) sin(2πft + ϕ) ; f - frequency ; A - amplitude ; y(t) - sine wave as a function of time ; t - time
# Bi-variate analysis: pairwise feature relationships coloured by target status
sns.pairplot(Parkinsons_df, diag_kind = 'kde', hue = 'status')
# Outlier screening: one box plot per float-typed feature
col_names = Parkinsons_df.select_dtypes(include=[np.float64]).columns
fig, ax = plt.subplots(len(col_names), figsize=(8,200))
for idx, feature in enumerate(col_names):
    sns.boxplot(y=Parkinsons_df[feature], ax=ax[idx])
    ax[idx].set_title('Box plot - {}'.format(feature), fontsize=10)
    ax[idx].set_xlabel(feature, fontsize=8)
plt.show()
# IQR based outlier identification - Distplot
def IQR_based_outlier(data, threshold = 1.5):
    """Return an element-wise boolean mask flagging values that fall outside
    [Q1 - threshold*IQR, Q3 + threshold*IQR] (Tukey's fences)."""
    q1 = np.quantile(data, 0.25)
    q3 = np.quantile(data, 0.75)
    fence = (q3 - q1) * threshold
    return (data < q1 - fence) | (data > q3 + fence)
# Visualize each float feature's density with its IQR-flagged outliers overlaid.
col_names = Parkinsons_df.select_dtypes(include=[np.float64]).columns
fig, ax = plt.subplots(len(col_names), figsize=(8,200))
for i, col_val in enumerate(col_names):
    x = Parkinsons_df[col_val][:1000]
    # sns.distplot(hist=False, rug=True) is deprecated/removed in seaborn;
    # kdeplot + rugplot reproduces the density curve with per-observation rug marks.
    sns.kdeplot(x, ax=ax[i])
    sns.rugplot(x, ax=ax[i])
    outliers = x[IQR_based_outlier(x)]
    print('List of Outliers detected for - {} \n '.format(col_val),outliers)
    # Mark the detected outliers on the baseline as red dots.
    ax[i].plot(outliers, np.zeros_like(outliers), 'ro', clip_on=False)
    ax[i].set_title('Outlier detection - {}'.format(col_val), fontsize=10)
    ax[i].set_xlabel(col_val, fontsize=8)
plt.show()
''' The strategy to be followed for Outlier - Treatment,should be taken based on domain expert advice
Imputing Median Values for all the right-skewed distribution/ Imputing mean values of the Outliers can be one
col_names = Parkinsons_df.select_dtypes(include=[np.float64]).columns
for i, col_val in enumerate(col_names):
median = Parkinsons_df[col_val].median()
Parkinsons_df[IQR_based_outlier(Parkinsons_df[col_val])][col_val] = median'''
# NOTE(review): the commented strategy above uses chained indexing
# (df[mask][col] = value), which assigns to a temporary copy and would not
# modify Parkinsons_df; use df.loc[mask, col] = value if it is ever enabled.
Parkinsons_df.info()
Pd_target_df = Parkinsons_df['status']             # target: 1 = Parkinson's, 0 = healthy
Pd_features_df = Parkinsons_df.drop('status', axis =1)
# StandardScaler is already imported at the top of the file.
SC = StandardScaler()
# Preserve the original column names: fit_transform returns a bare ndarray,
# and without columns= the DataFrame would get a RangeIndex — breaking the
# later uses of Pd_features_df.columns (feature importances, graphviz export).
Pd_features_df = pd.DataFrame(SC.fit_transform(Pd_features_df), columns=Pd_features_df.columns)
from sklearn.model_selection import train_test_split
# Convert features/target to arrays and hold out 30% for testing;
# fixed random_state keeps the split reproducible.
X = np.array(Pd_features_df)
y = np.array(Pd_target_df)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=10)
from sklearn.tree import DecisionTreeClassifier
# Baseline: a fully-grown entropy-based decision tree (no pruning yet).
dt_model = DecisionTreeClassifier(criterion = 'entropy', random_state = 10)
dt_model.fit(X_train, y_train)
from IPython.display import Image
from sklearn import tree
from os import system
train_char_label = ['0', '1']
names = Pd_features_df.columns.values
# Export the fitted tree to a Graphviz .dot file. The raw-string path avoids
# backslash-escape surprises on Windows, and the context manager guarantees
# the file handle is closed even if export_graphviz raises.
with open(r'D:\Pd_tree.dot', 'w') as PD_tree_file:
    tree.export_graphviz(dt_model, out_file=PD_tree_file,
                         feature_names=names, class_names=list(train_char_label))
# Importance of features in the tree building: the importance of a feature is
# the (normalized) total reduction of the split criterion brought by that
# feature, also known as the Gini importance.
print (pd.DataFrame(dt_model.feature_importances_, columns = ["Imp"], index = Pd_features_df.columns))
def printModelMetrics(model, trainscore, testscore, y_true=None, y_pred=None):
    """Print train/test accuracy, confusion matrix, classification report and
    per-class F1 scores for a fitted classifier.

    Parameters
    ----------
    model : str
        Human-readable model name used in the printed headers.
    trainscore, testscore : float
        Accuracy scores on the training and test sets.
    y_true, y_pred : array-like, optional
        Ground-truth and predicted labels. When omitted, the function falls
        back to the module-level ``y_test`` / ``y_predict`` — the original
        behaviour, which silently depended on those globals being current.
        Passing them explicitly is preferred.
    """
    if y_true is None:
        y_true = y_test
    if y_pred is None:
        y_pred = y_predict
    print('\n The training accuracy for {} is {}'.format(model, trainscore))
    print('\n The testing accuracy for {} is {}'.format(model, testscore))
    print('\n Confusion Matrix: \n')
    print(metrics.confusion_matrix(y_true, y_pred))
    print('\n Classification Report for Prediction: \n')
    print(metrics.classification_report(y_true, y_pred))
    print('\n F1 Scores for Prediction: \n')
    print(f1_score(y_true, y_pred, average=None))
# Evaluate the unpruned baseline tree: accuracy on both splits gauges overfit.
y_predict = dt_model.predict(X_test)
DT_train_score = dt_model.score(X_train , y_train)
DT_test_score = dt_model.score(X_test , y_test)
printModelMetrics('Decision Tree Classifier', DT_train_score, DT_test_score)
# Regularize the tree: cap depth at 3 to reduce variance, then re-evaluate.
reg_dt_model = DecisionTreeClassifier(criterion = 'entropy', max_depth = 3, min_samples_leaf = 1, random_state = 10)
reg_dt_model.fit(X_train, y_train)
y_predict = reg_dt_model.predict(X_test)
Reg_DT_train_score = reg_dt_model.score(X_train , y_train)
Reg_DT_test_score = reg_dt_model.score(X_test , y_test)
printModelMetrics('Regularized Decision Tree Classifier', Reg_DT_train_score, Reg_DT_test_score)
from sklearn.ensemble import RandomForestClassifier
# Ensemble of 40 shallow trees with the same depth/leaf constraints as the
# regularized single tree, for a like-for-like comparison.
rfcl = RandomForestClassifier(n_estimators = 40, max_depth =3, min_samples_leaf = 1, random_state = 10)
rfcl.fit(X_train, y_train)
y_predict = rfcl.predict(X_test)
rfcl_train_score = rfcl.score(X_train , y_train)
rfcl_test_score = rfcl.score(X_test , y_test)
printModelMetrics('Random Forest Classifier', rfcl_train_score, rfcl_test_score)